Data

Environmental data (n=10) Medical data (n=7)

You can find metadata here: https://github.com/HeatherWelch/melanoma

Scatterplots

With individual linear models fit to each pair

source('/Users/heatherwelch/Dropbox/melenoma/melanoma_GitHub/utilities/load_libraries.R')
modDF=read.csv("/Users/heatherwelch/Dropbox/melenoma/melanoma_GitHub/model_data/data_01_30_20.csv")
env=modDF %>% dplyr::select(SEER_rate,anRange_temperature,cancer_gov_UV_exposure,mean_cloud,elevation,mean_temperature,seasonality_cloud,seasonality_temperature,sun_exposure,UV_daily_dose,UV_irradiance)
master_env=env %>% gather(variable, value,-SEER_rate)

C3=ggplot(master_env,aes(x=SEER_rate,y=value),color="black")+geom_point(size=1)+stat_smooth(se=F,method = "lm",formula = y~x)+
  theme(text = element_text(size=8),axis.text = element_text(size=8),plot.title = element_text(hjust=0,size=8),legend.position=c(.15,.3),legend.justification = c(.9,.9),legend.key.size = unit(.5,'lines'))+
  theme(legend.background = element_blank(),legend.box.background = element_rect(colour = NA),legend.margin=unit(0.3, "lines"))+theme_classic()+
  ggtitle("Environment")+
  ylab("Predictor")+xlab("Melanoma rate")+facet_wrap(~variable,scales="free")

C3

jur=modDF %>% dplyr::select(-c(anRange_temperature,cancer_gov_UV_exposure,mean_cloud,elevation,mean_temperature,seasonality_cloud,seasonality_temperature,sun_exposure,UV_daily_dose,UV_irradiance))
master_jur=jur %>% gather(variable, value,-SEER_rate)

C3=ggplot(master_jur,aes(x=SEER_rate,y=value),color="black")+geom_point(size=1)+stat_smooth(se=F,method = "lm",formula = y~x)+
  theme(text = element_text(size=8),axis.text = element_text(size=8),plot.title = element_text(hjust=0,size=8),legend.position=c(.15,.3),legend.justification = c(.9,.9),legend.key.size = unit(.5,'lines'))+
  theme(legend.background = element_blank(),legend.box.background = element_rect(colour = NA),legend.margin=unit(0.3, "lines"))+theme_classic()+
  ggtitle("Medical")+
  ylab("Predictor")+xlab("Melanoma rate")+facet_wrap(~variable,scales="free")

C3

Corplot

M <- cor(modDF)
corrplot(M,type="upper",order="hclust",outline = T,tl.col="black",tl.cex = .9)

Generalized linear models (GLM)

Environmental GLMs

Summary

How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)

library(mgcv)
a=glm(SEER_rate~anRange_temperature+cancer_gov_UV_exposure+mean_cloud+elevation+mean_temperature+seasonality_cloud+seasonality_temperature+sun_exposure+UV_daily_dose+UV_irradiance,data=modDF)
summary(a)
## 
## Call:
## glm(formula = SEER_rate ~ anRange_temperature + cancer_gov_UV_exposure + 
##     mean_cloud + elevation + mean_temperature + seasonality_cloud + 
##     seasonality_temperature + sun_exposure + UV_daily_dose + 
##     UV_irradiance, data = modDF)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -56.936  -10.095   -1.911    9.052   82.217  
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             399.181082  89.142525   4.478 8.77e-06 ***
## anRange_temperature      -0.136452   0.088615  -1.540 0.124045    
## cancer_gov_UV_exposure    0.014442   0.009583   1.507 0.132245    
## mean_cloud               -0.064671   0.240035  -0.269 0.787683    
## elevation                -0.002797   0.004298  -0.651 0.515346    
## mean_temperature         -0.096078   0.076208  -1.261 0.207817    
## seasonality_cloud        -0.823232   0.339750  -2.423 0.015638 *  
## seasonality_temperature  -0.001326   0.002895  -0.458 0.646982    
## sun_exposure             -0.014480   0.012170  -1.190 0.234487    
## UV_daily_dose             0.217959   0.065034   3.351 0.000846 ***
## UV_irradiance            -4.192563   1.268161  -3.306 0.000994 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 299.1332)
## 
##     Null deviance: 240859  on 726  degrees of freedom
## Residual deviance: 214179  on 716  degrees of freedom
## AIC: 6220.6
## 
## Number of Fisher Scoring iterations: 2
env_glm_aic=a$aic
c=summary(a)
env_glm_r2=1-(c$deviance/c$null.deviance)

Predictor effects

This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)

library(effects)
for(i in 1:10){
plot(predictorEffects(a)[i],cex=.1)
}

Variable importance

You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)

library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
##      Overall                     Var
## 1  0.2694217              mean_cloud
## 2  0.4581517 seasonality_temperature
## 3  0.6508603               elevation
## 4  1.1898816            sun_exposure
## 5  1.2607303        mean_temperature
## 6  1.5070265  cancer_gov_UV_exposure
## 7  1.5398231     anRange_temperature
## 8  2.4230526       seasonality_cloud
## 9  3.3060182           UV_irradiance
## 10 3.3514550           UV_daily_dose

observed versus predicted

b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b) 
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))

ratio_env=sum(x$SEER_rate)/sum(x$predicted)
RMSE_env=RMSE(x$predicted,x$SEER_rate)

Medical GLMs

Summary

How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)

library(mgcv)
a=glm(SEER_rate~incm_pc+wpovr50+incm_mh+derm_pk+pcp_pk+docs_pk+wpvr100,data=modDF)
summary(a)
## 
## Call:
## glm(formula = SEER_rate ~ incm_pc + wpovr50 + incm_mh + derm_pk + 
##     pcp_pk + docs_pk + wpvr100, data = modDF)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -45.111  -10.902   -1.226    7.840   77.733  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 19.8331145  4.9162193   4.034 6.06e-05 ***
## incm_pc     -0.0002332  0.0000871  -2.678 0.007578 ** 
## wpovr50      0.3349817  0.1678134   1.996 0.046294 *  
## incm_mh      0.0003333  0.0001139   2.925 0.003551 ** 
## derm_pk      1.5139199  0.4574544   3.309 0.000981 ***
## pcp_pk       0.1067555  0.0379640   2.812 0.005057 ** 
## docs_pk     -0.0286660  0.0128139  -2.237 0.025586 *  
## wpvr100      0.0281144  0.1862127   0.151 0.880034    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 262.8892)
## 
##     Null deviance: 240859  on 726  degrees of freedom
## Residual deviance: 189017  on 719  degrees of freedom
## AIC: 6123.7
## 
## Number of Fisher Scoring iterations: 2
med_glm_aic=a$aic
c=summary(a)
med_glm_r2=1-(c$deviance/c$null.deviance)

Predictor effects

This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)

library(effects)
for(i in 1:7){
plot(predictorEffects(a)[i],cex=.1)
}

Variable importance

You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)

library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
##     Overall     Var
## 1 0.1509802 wpvr100
## 2 1.9961555 wpovr50
## 3 2.2370981 docs_pk
## 4 2.6778729 incm_pc
## 5 2.8120166  pcp_pk
## 6 2.9251561 incm_mh
## 7 3.3094446 derm_pk

observed versus predicted

b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b) 
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))

ratio_med=sum(x$SEER_rate)/sum(x$predicted)
RMSE_med=RMSE(x$predicted,x$SEER_rate)

Environment and medical GLMs

Summary

How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)

library(mgcv)
a=glm(SEER_rate~incm_pc+wpovr50+incm_mh+derm_pk+pcp_pk+docs_pk+wpvr100+anRange_temperature+cancer_gov_UV_exposure+mean_cloud+elevation+mean_temperature+seasonality_cloud+seasonality_temperature+sun_exposure+UV_daily_dose+UV_irradiance,data=modDF)
summary(a)
## 
## Call:
## glm(formula = SEER_rate ~ incm_pc + wpovr50 + incm_mh + derm_pk + 
##     pcp_pk + docs_pk + wpvr100 + anRange_temperature + cancer_gov_UV_exposure + 
##     mean_cloud + elevation + mean_temperature + seasonality_cloud + 
##     seasonality_temperature + sun_exposure + UV_daily_dose + 
##     UV_irradiance, data = modDF)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -58.679   -9.766   -0.545    8.086   78.393  
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.804e+02  8.156e+01   4.664 3.71e-06 ***
## incm_pc                 -1.085e-04  9.428e-05  -1.151 0.250312    
## wpovr50                  4.951e-01  1.773e-01   2.792 0.005374 ** 
## incm_mh                  4.748e-04  1.252e-04   3.792 0.000162 ***
## derm_pk                  9.326e-01  4.480e-01   2.082 0.037739 *  
## pcp_pk                   8.814e-02  3.724e-02   2.367 0.018198 *  
## docs_pk                 -1.328e-02  1.273e-02  -1.043 0.297343    
## wpvr100                 -4.183e-01  2.221e-01  -1.883 0.060075 .  
## anRange_temperature     -2.532e-02  8.108e-02  -0.312 0.754941    
## cancer_gov_UV_exposure  -2.651e-04  8.685e-03  -0.031 0.975657    
## mean_cloud               5.570e-01  2.289e-01   2.434 0.015192 *  
## elevation               -8.486e-04  3.953e-03  -0.215 0.830082    
## mean_temperature         3.939e-02  7.200e-02   0.547 0.584541    
## seasonality_cloud       -6.875e-01  3.165e-01  -2.172 0.030188 *  
## seasonality_temperature -2.923e-03  2.658e-03  -1.100 0.271805    
## sun_exposure             1.227e-03  1.111e-02   0.110 0.912114    
## UV_daily_dose            2.639e-01  5.909e-02   4.466 9.29e-06 ***
## UV_irradiance           -4.971e+00  1.150e+00  -4.321 1.78e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 240.0592)
## 
##     Null deviance: 240859  on 726  degrees of freedom
## Residual deviance: 170202  on 709  degrees of freedom
## AIC: 6067.5
## 
## Number of Fisher Scoring iterations: 2
envmed_glm_aic=a$aic
c=summary(a)
envmed_glm_r2=1-(c$deviance/c$null.deviance)

Predictor effects

This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)

library(effects)
for(i in 1:17){
plot(predictorEffects(a)[i],cex=.1)
}

Variable importance

You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)

library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
##       Overall                     Var
## 1  0.03052533  cancer_gov_UV_exposure
## 2  0.11041185            sun_exposure
## 3  0.21467593               elevation
## 4  0.31225242     anRange_temperature
## 5  0.54701394        mean_temperature
## 6  1.04291545                 docs_pk
## 7  1.09977129 seasonality_temperature
## 8  1.15053259                 incm_pc
## 9  1.88325468                 wpvr100
## 10 2.08157945                 derm_pk
## 11 2.17198227       seasonality_cloud
## 12 2.36705777                  pcp_pk
## 13 2.43367340              mean_cloud
## 14 2.79230559                 wpovr50
## 15 3.79171390                 incm_mh
## 16 4.32096854           UV_irradiance
## 17 4.46560857           UV_daily_dose

observed versus predicted

b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b) 
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))

ratio_envmed=sum(x$SEER_rate)/sum(x$predicted)
RMSE_envmed=RMSE(x$predicted,x$SEER_rate)

GLM inter-model comparisons

Some AIC principles:

  1. Lower indicates a more parsimonious model, relative to a model fit with a higher AIC.

  2. It is a relative measure of model parsimony, so it only has meaning if we compare the AIC for alternate hypotheses (= different models of the data).

Some RMSE principles:
RMSE (residual mean square error) represents the model prediction error, that is the average difference the observed outcome values and the predicted outcome values.

AIC=list(env_glm_aic,med_glm_aic,envmed_glm_aic)
r2=list(env_glm_r2,med_glm_r2,envmed_glm_r2)
ratio=list(ratio_env,ratio_med,ratio_envmed)
RMSE=list(RMSE_env,RMSE_med,RMSE_envmed)
names=list("Environment","Medical","Combined")

dat=data.frame(AIC=as.numeric(AIC),
               r2=as.numeric(r2),
               ratio=as.numeric(ratio),
               RMSE=as.numeric(RMSE),
               model=as.character(names),
               stringsAsFactors = F)
dat
##        AIC        r2 ratio     RMSE       model
## 1 6220.599 0.1107697     1 17.16412 Environment
## 2 6123.742 0.2152378     1 16.12440     Medical
## 3 6067.514 0.2933552     1 15.30083    Combined

Generalized linear models with quadratic terms

I am not sure if this is the best way to do this, but I’m trying to match your interest of only allowing model responses to curve once. Normally I allow models to wiggle a lot more than this. Essentially these are the exact same models as above, but they can follow a quadratic function: http://dl.uncw.edu/digilib/Mathematics/Algebra/mat111hb/PandR/quadratic/quadratic.html

Environmental quadratic GLMs

Summary

How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)

library(mgcv)
a=glm(SEER_rate~I(anRange_temperature^2)+I(cancer_gov_UV_exposure^2)+I(mean_cloud^2)+I(elevation^2)+I(mean_temperature^2)+I(seasonality_cloud^2)+I(seasonality_temperature^2)+I(sun_exposure^2)+I(UV_daily_dose^2)+I(UV_irradiance^2),data=modDF)
summary(a)
## 
## Call:
## glm(formula = SEER_rate ~ I(anRange_temperature^2) + I(cancer_gov_UV_exposure^2) + 
##     I(mean_cloud^2) + I(elevation^2) + I(mean_temperature^2) + 
##     I(seasonality_cloud^2) + I(seasonality_temperature^2) + I(sun_exposure^2) + 
##     I(UV_daily_dose^2) + I(UV_irradiance^2), data = modDF)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -49.520  -10.542   -2.127    8.555   77.322  
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.526e+01  1.613e+01   5.287 1.65e-07 ***
## I(anRange_temperature^2)     -1.740e-04  1.499e-04  -1.161  0.24604    
## I(cancer_gov_UV_exposure^2)   1.482e-06  9.949e-07   1.489  0.13688    
## I(mean_cloud^2)              -7.084e-03  1.518e-03  -4.668 3.64e-06 ***
## I(elevation^2)                1.328e-06  1.388e-06   0.956  0.33932    
## I(mean_temperature^2)        -7.184e-04  2.426e-04  -2.961  0.00317 ** 
## I(seasonality_cloud^2)       -2.364e-02  1.419e-02  -1.665  0.09631 .  
## I(seasonality_temperature^2) -1.476e-07  2.075e-07  -0.712  0.47698    
## I(sun_exposure^2)            -7.393e-06  3.119e-06  -2.370  0.01803 *  
## I(UV_daily_dose^2)           -1.009e-05  5.695e-06  -1.772  0.07689 .  
## I(UV_irradiance^2)            1.477e-03  1.071e-03   1.379  0.16831    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 292.6268)
## 
##     Null deviance: 240859  on 726  degrees of freedom
## Residual deviance: 209521  on 716  degrees of freedom
## AIC: 6204.6
## 
## Number of Fisher Scoring iterations: 2
env_glm_aic=a$aic
c=summary(a)
env_glm_r2=1-(c$deviance/c$null.deviance)

Predictor effects

This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)

library(effects)
for(i in 1:10){
plot(predictorEffects(a)[i],cex=.1)
}

Variable importance

You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)

library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
##      Overall                          Var
## 1  0.7115420 I(seasonality_temperature^2)
## 2  0.9561441               I(elevation^2)
## 3  1.1609668     I(anRange_temperature^2)
## 4  1.3790514           I(UV_irradiance^2)
## 5  1.4891887  I(cancer_gov_UV_exposure^2)
## 6  1.6652203       I(seasonality_cloud^2)
## 7  1.7715736           I(UV_daily_dose^2)
## 8  2.3704288            I(sun_exposure^2)
## 9  2.9611259        I(mean_temperature^2)
## 10 4.6676791              I(mean_cloud^2)

observed versus predicted

b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b) 
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))

ratio_env=sum(x$SEER_rate)/sum(x$predicted)
RMSE_env=RMSE(x$predicted,x$SEER_rate)

Medical quadratic GLMs

Summary

How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)

library(mgcv)
a=glm(SEER_rate~I(incm_pc^2)+I(wpovr50^2)+I(incm_mh^2)+I(derm_pk^2)+I(pcp_pk^2)+I(docs_pk^2)+I(wpvr100^2),data=modDF)
summary(a)
## 
## Call:
## glm(formula = SEER_rate ~ I(incm_pc^2) + I(wpovr50^2) + I(incm_mh^2) + 
##     I(derm_pk^2) + I(pcp_pk^2) + I(docs_pk^2) + I(wpvr100^2), 
##     data = modDF)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -45.888  -10.647   -1.455    8.413   80.374  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.953e+01  2.366e+00  12.481  < 2e-16 ***
## I(incm_pc^2) -6.348e-10  7.467e-10  -0.850   0.3955    
## I(wpovr50^2)  5.918e-03  1.375e-03   4.302 1.92e-05 ***
## I(incm_mh^2)  1.442e-09  1.032e-09   1.397   0.1630    
## I(derm_pk^2)  5.821e-02  3.548e-02   1.641   0.1013    
## I(pcp_pk^2)   5.734e-04  2.222e-04   2.581   0.0101 *  
## I(docs_pk^2) -3.008e-05  1.600e-05  -1.880   0.0605 .  
## I(wpvr100^2) -2.814e-03  3.162e-03  -0.890   0.3737    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 268.3403)
## 
##     Null deviance: 240859  on 726  degrees of freedom
## Residual deviance: 192937  on 719  degrees of freedom
## AIC: 6138.7
## 
## Number of Fisher Scoring iterations: 2
med_glm_aic=a$aic
c=summary(a)
med_glm_r2=1-(c$deviance/c$null.deviance)

Predictor effects

This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)

library(effects)
for(i in 1:7){
plot(predictorEffects(a)[i],cex=.1)
}

Variable importance

You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)

library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
##     Overall          Var
## 1 0.8501913 I(incm_pc^2)
## 2 0.8901693 I(wpvr100^2)
## 3 1.3965039 I(incm_mh^2)
## 4 1.6407766 I(derm_pk^2)
## 5 1.8801337 I(docs_pk^2)
## 6 2.5806500  I(pcp_pk^2)
## 7 4.3022989 I(wpovr50^2)

observed versus predicted

b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b) 
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))

ratio_med=sum(x$SEER_rate)/sum(x$predicted)
RMSE_med=RMSE(x$predicted,x$SEER_rate)

Environment and medical quadratic GLMs

Summary

How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)

library(mgcv)
a=glm(SEER_rate~I(incm_pc^2)+I(wpovr50^2)+I(incm_mh^2)+I(derm_pk^2)+I(pcp_pk^2)+I(docs_pk^2)+I(wpvr100^2)+I(anRange_temperature^2)+I(cancer_gov_UV_exposure^2)+I(mean_cloud^2)+I(elevation^2)+I(mean_temperature^2)+I(seasonality_cloud^2)+I(seasonality_temperature^2)+I(sun_exposure^2)+I(UV_daily_dose^2)+I(UV_irradiance^2),data=modDF)
summary(a)
## 
## Call:
## glm(formula = SEER_rate ~ I(incm_pc^2) + I(wpovr50^2) + I(incm_mh^2) + 
##     I(derm_pk^2) + I(pcp_pk^2) + I(docs_pk^2) + I(wpvr100^2) + 
##     I(anRange_temperature^2) + I(cancer_gov_UV_exposure^2) + 
##     I(mean_cloud^2) + I(elevation^2) + I(mean_temperature^2) + 
##     I(seasonality_cloud^2) + I(seasonality_temperature^2) + I(sun_exposure^2) + 
##     I(UV_daily_dose^2) + I(UV_irradiance^2), data = modDF)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -54.317   -9.602   -0.842    8.188   75.334  
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   1.081e+01  1.618e+01   0.668 0.504404    
## I(incm_pc^2)                  5.665e-10  7.596e-10   0.746 0.456074    
## I(wpovr50^2)                  9.161e-03  1.405e-03   6.520 1.34e-10 ***
## I(incm_mh^2)                  2.124e-09  1.111e-09   1.911 0.056422 .  
## I(derm_pk^2)                  1.885e-02  3.431e-02   0.549 0.582968    
## I(pcp_pk^2)                   4.593e-04  2.140e-04   2.147 0.032150 *  
## I(docs_pk^2)                 -9.458e-06  1.556e-05  -0.608 0.543589    
## I(wpvr100^2)                 -1.401e-02  3.710e-03  -3.775 0.000173 ***
## I(anRange_temperature^2)      5.755e-05  1.385e-04   0.415 0.677938    
## I(cancer_gov_UV_exposure^2)  -5.054e-07  9.308e-07  -0.543 0.587338    
## I(mean_cloud^2)              -8.118e-04  1.528e-03  -0.531 0.595479    
## I(elevation^2)               -7.988e-07  1.270e-06  -0.629 0.529609    
## I(mean_temperature^2)        -9.185e-04  2.239e-04  -4.102 4.56e-05 ***
## I(seasonality_cloud^2)       -1.149e-02  1.335e-02  -0.860 0.389923    
## I(seasonality_temperature^2) -3.519e-07  1.909e-07  -1.843 0.065712 .  
## I(sun_exposure^2)             8.655e-07  2.917e-06   0.297 0.766764    
## I(UV_daily_dose^2)           -2.972e-05  5.488e-06  -5.416 8.35e-08 ***
## I(UV_irradiance^2)            5.567e-03  1.044e-03   5.330 1.32e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 238.4376)
## 
##     Null deviance: 240859  on 726  degrees of freedom
## Residual deviance: 169052  on 709  degrees of freedom
## AIC: 6062.6
## 
## Number of Fisher Scoring iterations: 2
envmed_glm_aic=a$aic
c=summary(a)
envmed_glm_r2=1-(c$deviance/c$null.deviance)

Predictor effects

This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)

library(effects)
for(i in 1:17){
plot(predictorEffects(a)[i],cex=.1)
}

Variable importance

You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)

library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
##      Overall                          Var
## 1  0.2967236            I(sun_exposure^2)
## 2  0.4154509     I(anRange_temperature^2)
## 3  0.5311536              I(mean_cloud^2)
## 4  0.5429457  I(cancer_gov_UV_exposure^2)
## 5  0.5493062                 I(derm_pk^2)
## 6  0.6076885                 I(docs_pk^2)
## 7  0.6289122               I(elevation^2)
## 8  0.7457355                 I(incm_pc^2)
## 9  0.8602838       I(seasonality_cloud^2)
## 10 1.8432374 I(seasonality_temperature^2)
## 11 1.9108884                 I(incm_mh^2)
## 12 2.1467680                  I(pcp_pk^2)
## 13 3.7748888                 I(wpvr100^2)
## 14 4.1024103        I(mean_temperature^2)
## 15 5.3301649           I(UV_irradiance^2)
## 16 5.4160841           I(UV_daily_dose^2)
## 17 6.5198669                 I(wpovr50^2)

observed versus predicted

b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b) 
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))

ratio_envmed=sum(x$SEER_rate)/sum(x$predicted)
RMSE_envmed=RMSE(x$predicted,x$SEER_rate)

quadratic GLM inter-model comparisons

Some AIC principles:

  1. Lower indicates a more parsimonious model, relative to a model fit with a higher AIC.

  2. It is a relative measure of model parsimony, so it only has meaning if we compare the AIC for alternate hypotheses (= different models of the data).

Some RMSE principles:
RMSE (residual mean square error) represents the model prediction error, that is the average difference the observed outcome values and the predicted outcome values.

AIC=list(env_glm_aic,med_glm_aic,envmed_glm_aic)
r2=list(env_glm_r2,med_glm_r2,envmed_glm_r2)
ratio=list(ratio_env,ratio_med,ratio_envmed)
RMSE=list(RMSE_env,RMSE_med,RMSE_envmed)
names=list("Environment","Medical","Combined")

dat=data.frame(AIC=as.numeric(AIC),
               r2=as.numeric(r2),
               ratio=as.numeric(ratio),
               RMSE=as.numeric(RMSE),
               model=as.character(names),
               stringsAsFactors = F)
dat
##        AIC        r2 ratio     RMSE       model
## 1 6204.611 0.1301115     1 16.97643 Environment
## 2 6138.662 0.1989654     1 16.29072     Medical
## 3 6062.586 0.2981285     1 15.24907    Combined

Generalized additive models (GAMs)

These are going to use “shrinkage” to assess the optimal wiggliness of model fits. Wiggle are fit using thin plate regression splines. These models will also effectively remove predictors from the model if they are not contributing significantly.

Environmental GAMs

Summary

How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)

library(mgcv)
a=gam(SEER_rate~s(anRange_temperature,bs="ts")+s(cancer_gov_UV_exposure,bs="ts")+s(mean_cloud,bs="ts")+s(elevation,bs="ts")+s(mean_temperature,bs="ts")+s(seasonality_cloud,bs="ts")+s(seasonality_temperature,bs="ts")+s(sun_exposure,bs="ts")+s(UV_daily_dose,bs="ts")+s(UV_irradiance,bs="ts"),data=modDF)
summary(a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## SEER_rate ~ s(anRange_temperature, bs = "ts") + s(cancer_gov_UV_exposure, 
##     bs = "ts") + s(mean_cloud, bs = "ts") + s(elevation, bs = "ts") + 
##     s(mean_temperature, bs = "ts") + s(seasonality_cloud, bs = "ts") + 
##     s(seasonality_temperature, bs = "ts") + s(sun_exposure, bs = "ts") + 
##     s(UV_daily_dose, bs = "ts") + s(UV_irradiance, bs = "ts")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  48.4469     0.5312   91.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                              edf Ref.df     F  p-value    
## s(anRange_temperature)     7.306      9 3.662 2.40e-06 ***
## s(cancer_gov_UV_exposure)  1.835      9 0.118 0.522023    
## s(mean_cloud)              8.425      9 3.761 1.02e-05 ***
## s(elevation)               1.248      9 1.162 0.000271 ***
## s(mean_temperature)        7.761      9 4.023 7.65e-07 ***
## s(seasonality_cloud)       5.303      9 1.891 0.001379 ** 
## s(seasonality_temperature) 7.624      9 3.449 6.66e-06 ***
## s(sun_exposure)            2.434      9 0.602 0.048073 *  
## s(UV_daily_dose)           8.178      9 9.153  < 2e-16 ***
## s(UV_irradiance)           0.477      9 0.077 0.194512    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.382   Deviance explained = 42.5%
## GCV = 220.79  Scale est. = 205.12    n = 727
env_glm_aic=AIC(a)
env_glm_r2=1-(a$deviance/a$null.deviance)

Predictor effects

This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)

plot(a,pages=1)

Variable importance

You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)

library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
##       Overall                     Var
## 1   0.2823104  cancer_gov_UV_exposure
## 2   0.7110526           UV_irradiance
## 3   1.3180956            sun_exposure
## 4   2.8604232       seasonality_cloud
## 5   3.5667163               elevation
## 6   4.9928597              mean_cloud
## 7   5.1765637 seasonality_temperature
## 8   5.6192172     anRange_temperature
## 9   6.1163033        mean_temperature
## 10 19.4286349           UV_daily_dose

observed versus predicted

b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b) 
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))

ratio_env=sum(x$SEER_rate)/sum(x$predicted)
RMSE_env=RMSE(x$predicted,x$SEER_rate)

Medical GAMs

Summary

How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)

library(mgcv)
a=gam(SEER_rate~s(incm_pc,bs="ts")+s(wpovr50,bs="ts")+s(incm_mh,bs="ts")+s(derm_pk,bs="ts")+s(pcp_pk,bs="ts")+s(docs_pk,bs="ts")+s(wpvr100,bs="ts"),data=modDF)
summary(a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## SEER_rate ~ s(incm_pc, bs = "ts") + s(wpovr50, bs = "ts") + s(incm_mh, 
##     bs = "ts") + s(derm_pk, bs = "ts") + s(pcp_pk, bs = "ts") + 
##     s(docs_pk, bs = "ts") + s(wpvr100, bs = "ts")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  48.4469     0.5836   83.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df     F  p-value    
## s(incm_pc) 4.9313      9 2.980 8.01e-06 ***
## s(wpovr50) 4.2884      9 1.386 0.003095 ** 
## s(incm_mh) 1.6059      9 0.912 0.005037 ** 
## s(derm_pk) 0.9919      9 1.119 0.000837 ***
## s(pcp_pk)  4.9516      9 2.660 4.60e-05 ***
## s(docs_pk) 0.7975      9 0.366 0.038217 *  
## s(wpvr100) 3.4613      9 0.876 0.026058 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.254   Deviance explained = 27.5%
## GCV = 255.35  Scale est. = 247.62    n = 727
med_glm_aic=AIC(a)
med_glm_r2=1-(a$deviance/a$null.deviance)

Predictor effects

This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)

plot(a,pages=1)

Variable importance

You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)

library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
##    Overall     Var
## 1 1.417746 docs_pk
## 2 1.584052 wpvr100
## 3 2.297864 incm_mh
## 4 2.509334 wpovr50
## 5 3.077054 derm_pk
## 6 4.337546  pcp_pk
## 7 5.096382 incm_pc

observed versus predicted

b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b) 
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))

ratio_med=sum(x$SEER_rate)/sum(x$predicted)
RMSE_med=RMSE(x$predicted,x$SEER_rate)

Environment and medical GAMs

Summary

How to interpret summary: https://stats.stackexchange.com/questions/86351/interpretation-of-rs-output-for-binomial-regression
(Don’t worry about this)

library(mgcv)
a=gam(SEER_rate~s(anRange_temperature,bs="ts")+s(cancer_gov_UV_exposure,bs="ts")+s(mean_cloud,bs="ts")+s(elevation,bs="ts")+s(mean_temperature,bs="ts")+s(seasonality_cloud,bs="ts")+s(seasonality_temperature,bs="ts")+s(sun_exposure,bs="ts")+s(UV_daily_dose,bs="ts")+s(UV_irradiance,bs="ts")+s(incm_pc,bs="ts")+s(wpovr50,bs="ts")+s(incm_mh,bs="ts")+s(derm_pk,bs="ts")+s(pcp_pk,bs="ts")+s(docs_pk,bs="ts")+s(wpvr100,bs="ts"),data=modDF)
summary(a)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## SEER_rate ~ s(anRange_temperature, bs = "ts") + s(cancer_gov_UV_exposure, 
##     bs = "ts") + s(mean_cloud, bs = "ts") + s(elevation, bs = "ts") + 
##     s(mean_temperature, bs = "ts") + s(seasonality_cloud, bs = "ts") + 
##     s(seasonality_temperature, bs = "ts") + s(sun_exposure, bs = "ts") + 
##     s(UV_daily_dose, bs = "ts") + s(UV_irradiance, bs = "ts") + 
##     s(incm_pc, bs = "ts") + s(wpovr50, bs = "ts") + s(incm_mh, 
##     bs = "ts") + s(derm_pk, bs = "ts") + s(pcp_pk, bs = "ts") + 
##     s(docs_pk, bs = "ts") + s(wpvr100, bs = "ts")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  48.4469     0.4568   106.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                  edf Ref.df     F  p-value    
## s(anRange_temperature)     7.747e+00      9 5.195 1.67e-08 ***
## s(cancer_gov_UV_exposure)  4.836e+00      9 1.071 0.041169 *  
## s(mean_cloud)              8.170e+00      9 4.495 5.70e-07 ***
## s(elevation)               1.405e+00      9 2.355 1.05e-06 ***
## s(mean_temperature)        8.571e+00      9 7.190 1.29e-11 ***
## s(seasonality_cloud)       5.334e+00      9 1.527 0.008824 ** 
## s(seasonality_temperature) 7.027e+00      9 3.783 3.33e-06 ***
## s(sun_exposure)            1.116e-06      9 0.000 0.508896    
## s(UV_daily_dose)           8.295e+00      9 3.725 1.54e-05 ***
## s(UV_irradiance)           8.658e+00      9 3.052 0.000238 ***
## s(incm_pc)                 5.956e-01      9 0.164 0.107798    
## s(wpovr50)                 1.539e-07      9 0.000 0.975020    
## s(incm_mh)                 7.612e-01      9 0.573 0.007799 ** 
## s(derm_pk)                 3.447e-05      9 0.000 0.427150    
## s(pcp_pk)                  7.518e+00      9 3.615 2.24e-05 ***
## s(docs_pk)                 2.646e-05      9 0.000 0.453685    
## s(wpvr100)                 7.199e+00      9 3.981 1.48e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.543   Deviance explained = 59.1%
## GCV = 169.72  Scale est. = 151.72    n = 727
envmed_glm_aic=AIC(a)
envmed_glm_r2=1-(a$deviance/a$null.deviance)

Predictor effects

This is the relationship between each predictor and SEER_rate, holding all other variables at their average value You can read more about it here: https://cran.r-project.org/web/packages/effects/vignettes/predictor-effects-gallery.pdf
(Don’t worry about this)

plot(a,pages=1)

Variable importance

You can read more about how this is caluclated here: https://topepo.github.io/caret/variable-importance.html
(Don’t worry about this)

library(caret)
b=varImp(a) %>% mutate(Var=rownames(.))
arrange(b,Overall)
##        Overall                     Var
## 1   0.01098665                 wpovr50
## 2   0.29337106            sun_exposure
## 3   0.34324602                 docs_pk
## 4   0.36941947                 derm_pk
## 5   0.96738759                 incm_pc
## 6   1.38542445  cancer_gov_UV_exposure
## 7   2.05433312       seasonality_cloud
## 8   2.10795161                 incm_mh
## 9   3.62282430           UV_irradiance
## 10  4.64940077                  pcp_pk
## 11  4.81234109           UV_daily_dose
## 12  5.47767891 seasonality_temperature
## 13  5.82842867                 wpvr100
## 14  5.97778796               elevation
## 15  6.24429079              mean_cloud
## 16  7.77761027     anRange_temperature
## 17 10.88884578        mean_temperature

observed versus predicted

b=predict(a,modDF)
x=modDF %>% dplyr::select(SEER_rate) %>% mutate(predicted=b) 
ggplot(x)+geom_density(aes(SEER_rate,color="Observed"))+geom_density(aes(predicted,color="Predicted"))

ratio_envmed=sum(x$SEER_rate)/sum(x$predicted)
RMSE_envmed=RMSE(x$predicted,x$SEER_rate)

GAM inter-model comparisons

Some AIC principles:

  1. Lower indicates a more parsimonious model, relative to a model fit with a higher AIC.

  2. It is a relative measure of model parsimony, so it only has meaning if we compare the AIC for alternate hypotheses (= different models of the data).

Some RMSE principles:
RMSE (residual mean square error) represents the model prediction error, that is the average difference the observed outcome values and the predicted outcome values.

AIC=list(env_glm_aic,med_glm_aic,envmed_glm_aic)
r2=list(env_glm_r2,med_glm_r2,envmed_glm_r2)
ratio=list(ratio_env,ratio_med,ratio_envmed)
RMSE=list(RMSE_env,RMSE_med,RMSE_envmed)
names=list("Environment","Medical","Combined")

dat=data.frame(AIC=as.numeric(AIC),
               r2=as.numeric(r2),
               ratio=as.numeric(ratio),
               RMSE=as.numeric(RMSE),
               model=as.character(names),
               stringsAsFactors = F)
dat
##        AIC        r2 ratio     RMSE       model
## 1 5985.071 0.4247986     1 13.80462 Environment
## 2 6093.957 0.2752547     1 15.49556     Medical
## 3 5788.859 0.5906384     1 11.64577    Combined